{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "collapsed": true,
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%% \n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "train_df = pd.read_csv('train_sub.csv',dtype=object)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "test_df = pd.read_csv('test.csv',dtype=object)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "train_df.columns"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "data": {
      "text/plain": "               Unnamed: 0       id  click  hour  C1  banner_pos  site_id  \\\nsite_category                                                              \n0569f928             9077     9077      2    52   1           1        3   \n110ab22d                2        2      1     2   1           1        1   \n28905ebd          1817654  1817654      2    53   2           3       84   \n335d28a8            44296    44296      2    53   2           2      116   \n3e814130           736135   736135      2    53   1           3      603   \n42a36e14              343      343      2    48   1           1       14   \n50e219e0          4169502  4169502      2    53   7           6     1815   \n5378d028              105      105      2    30   1           2       11   \n70fb0e29             8249     8249      2    52   2           2        5   \n72722551            13868    13868      2    53   1           2       16   \n74073276                5        5      1     2   1           1        2   \n75fa27f6            28605    28605      2    53   1           2       26   \n76b2941d            27707    27707      2    53   2           2       31   \n8fd0aea4               93       93      2    29   1           1        4   \n9ccfa2ea               23       23      1    17   1           1        1   \na818d37a             1586     1586      2    50   1           1        6   \nbcf865d9              252      252      2    40   1           1        5   \nc0dd3be3             6405     6405      2    53   2           3       12   \nc706e647                2        2      1     1   1           1        1   \ndedf689d             4853     4853      2    52   2           2       12   \ne787de0e              367      367      2    21   1           2        9   \nf028772b          3038300  3038300      2    53   4           5      686   \nf66779e6            92571    92571      2    53   1           3       34   \n\n               site_domain  site_category  app_id  
...  device_type  \\\nsite_category                                      ...                \n0569f928                 3              1       1  ...            1   \n110ab22d                 1              1       1  ...            1   \n28905ebd                69              1       1  ...            1   \n335d28a8                95              1       1  ...            1   \n3e814130               579              1       1  ...            1   \n42a36e14                13              1       1  ...            1   \n50e219e0              3622              1    5469  ...            4   \n5378d028                11              1       1  ...            1   \n70fb0e29                 5              1       1  ...            1   \n72722551                16              1       1  ...            1   \n74073276                 1              1       1  ...            1   \n75fa27f6                25              1       1  ...            1   \n76b2941d                24              1       1  ...            1   \n8fd0aea4                 1              1       1  ...            1   \n9ccfa2ea                 1              1       1  ...            1   \na818d37a                 3              1       1  ...            1   \nbcf865d9                 1              1       1  ...            1   \nc0dd3be3                13              1       1  ...            1   \nc706e647                 1              1       1  ...            1   \ndedf689d                12              1       1  ...            1   \ne787de0e                 9              1       1  ...            1   \nf028772b               523              1       1  ...            3   \nf66779e6                33              1       1  ...            
1   \n\n               device_conn_type  C14  C15  C16  C17  C18  C19  C20  C21  \nsite_category                                                            \n0569f928                      2   78    2    2   29    4   15   34   17  \n110ab22d                      1    1    1    1    1    1    1    1    1  \n28905ebd                      2  394    4    4  114    4   25  159   34  \n335d28a8                      2  286    4    4   93    4   26  142   31  \n3e814130                      2  397    5    5  118    4   29  154   35  \n42a36e14                      2   53    2    3   32    3   15   35   20  \n50e219e0                      4  893    8    9  196    4   41  168   41  \n5378d028                      1   29    3    3   16    4    8    6   12  \n70fb0e29                      2   85    2    2   43    4   17   43   21  \n72722551                      2  107    3    3   38    4   13   44   18  \n74073276                      1    3    1    1    2    1    2    1    2  \n75fa27f6                      2  201    4    4   68    4   21  108   28  \n76b2941d                      2   89    3    3   41    4   15   61   22  \n8fd0aea4                      2   24    2    2   16    4   12   13   12  \n9ccfa2ea                      2    2    1    1    2    1    2    2    2  \na818d37a                      2   17    2    2    7    3    3   11    5  \nbcf865d9                      2    6    2    2    4    1    4    5    4  \nc0dd3be3                      2  121    4    4   47    4   18   30   21  \nc706e647                      1    1    1    1    1    1    1    2    1  \ndedf689d                      2   94    3    3   45    4   19   73   24  \ne787de0e                      2   21    2    2   13    3    7   14    7  \nf028772b                      2  584    5    6  149    4   32  161   36  \nf66779e6                      2  270    4    4   95    4   26  145   32  \n\n[23 rows x 25 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0</th>\n      <th>id</th>\n      <th>click</th>\n      <th>hour</th>\n      <th>C1</th>\n      <th>banner_pos</th>\n      <th>site_id</th>\n      <th>site_domain</th>\n      <th>site_category</th>\n      <th>app_id</th>\n      <th>...</th>\n      <th>device_type</th>\n      <th>device_conn_type</th>\n      <th>C14</th>\n      <th>C15</th>\n      <th>C16</th>\n      <th>C17</th>\n      <th>C18</th>\n      <th>C19</th>\n      <th>C20</th>\n      <th>C21</th>\n    </tr>\n    <tr>\n      <th>site_category</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0569f928</th>\n      <td>9077</td>\n      <td>9077</td>\n      <td>2</td>\n      <td>52</td>\n      <td>1</td>\n      <td>1</td>\n      <td>3</td>\n      <td>3</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>78</td>\n      <td>2</td>\n      <td>2</td>\n      <td>29</td>\n      <td>4</td>\n      <td>15</td>\n      <td>34</td>\n      <td>17</td>\n    </tr>\n    <tr>\n      <th>110ab22d</th>\n      <td>2</td>\n      <td>2</td>\n      <td>1</td>\n      <td>2</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      
<td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>28905ebd</th>\n      <td>1817654</td>\n      <td>1817654</td>\n      <td>2</td>\n      <td>53</td>\n      <td>2</td>\n      <td>3</td>\n      <td>84</td>\n      <td>69</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>394</td>\n      <td>4</td>\n      <td>4</td>\n      <td>114</td>\n      <td>4</td>\n      <td>25</td>\n      <td>159</td>\n      <td>34</td>\n    </tr>\n    <tr>\n      <th>335d28a8</th>\n      <td>44296</td>\n      <td>44296</td>\n      <td>2</td>\n      <td>53</td>\n      <td>2</td>\n      <td>2</td>\n      <td>116</td>\n      <td>95</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>286</td>\n      <td>4</td>\n      <td>4</td>\n      <td>93</td>\n      <td>4</td>\n      <td>26</td>\n      <td>142</td>\n      <td>31</td>\n    </tr>\n    <tr>\n      <th>3e814130</th>\n      <td>736135</td>\n      <td>736135</td>\n      <td>2</td>\n      <td>53</td>\n      <td>1</td>\n      <td>3</td>\n      <td>603</td>\n      <td>579</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>397</td>\n      <td>5</td>\n      <td>5</td>\n      <td>118</td>\n      <td>4</td>\n      <td>29</td>\n      <td>154</td>\n      <td>35</td>\n    </tr>\n    <tr>\n      <th>42a36e14</th>\n      <td>343</td>\n      <td>343</td>\n      <td>2</td>\n      <td>48</td>\n      <td>1</td>\n      <td>1</td>\n      <td>14</td>\n      <td>13</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>53</td>\n      <td>2</td>\n      <td>3</td>\n      <td>32</td>\n      <td>3</td>\n      <td>15</td>\n      <td>35</td>\n      <td>20</td>\n    </tr>\n    <tr>\n      <th>50e219e0</th>\n      
<td>4169502</td>\n      <td>4169502</td>\n      <td>2</td>\n      <td>53</td>\n      <td>7</td>\n      <td>6</td>\n      <td>1815</td>\n      <td>3622</td>\n      <td>1</td>\n      <td>5469</td>\n      <td>...</td>\n      <td>4</td>\n      <td>4</td>\n      <td>893</td>\n      <td>8</td>\n      <td>9</td>\n      <td>196</td>\n      <td>4</td>\n      <td>41</td>\n      <td>168</td>\n      <td>41</td>\n    </tr>\n    <tr>\n      <th>5378d028</th>\n      <td>105</td>\n      <td>105</td>\n      <td>2</td>\n      <td>30</td>\n      <td>1</td>\n      <td>2</td>\n      <td>11</td>\n      <td>11</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>1</td>\n      <td>29</td>\n      <td>3</td>\n      <td>3</td>\n      <td>16</td>\n      <td>4</td>\n      <td>8</td>\n      <td>6</td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <th>70fb0e29</th>\n      <td>8249</td>\n      <td>8249</td>\n      <td>2</td>\n      <td>52</td>\n      <td>2</td>\n      <td>2</td>\n      <td>5</td>\n      <td>5</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>85</td>\n      <td>2</td>\n      <td>2</td>\n      <td>43</td>\n      <td>4</td>\n      <td>17</td>\n      <td>43</td>\n      <td>21</td>\n    </tr>\n    <tr>\n      <th>72722551</th>\n      <td>13868</td>\n      <td>13868</td>\n      <td>2</td>\n      <td>53</td>\n      <td>1</td>\n      <td>2</td>\n      <td>16</td>\n      <td>16</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>107</td>\n      <td>3</td>\n      <td>3</td>\n      <td>38</td>\n      <td>4</td>\n      <td>13</td>\n      <td>44</td>\n      <td>18</td>\n    </tr>\n    <tr>\n      <th>74073276</th>\n      <td>5</td>\n      <td>5</td>\n      <td>1</td>\n      <td>2</td>\n      <td>1</td>\n      <td>1</td>\n      <td>2</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      
<td>1</td>\n      <td>3</td>\n      <td>1</td>\n      <td>1</td>\n      <td>2</td>\n      <td>1</td>\n      <td>2</td>\n      <td>1</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>75fa27f6</th>\n      <td>28605</td>\n      <td>28605</td>\n      <td>2</td>\n      <td>53</td>\n      <td>1</td>\n      <td>2</td>\n      <td>26</td>\n      <td>25</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>201</td>\n      <td>4</td>\n      <td>4</td>\n      <td>68</td>\n      <td>4</td>\n      <td>21</td>\n      <td>108</td>\n      <td>28</td>\n    </tr>\n    <tr>\n      <th>76b2941d</th>\n      <td>27707</td>\n      <td>27707</td>\n      <td>2</td>\n      <td>53</td>\n      <td>2</td>\n      <td>2</td>\n      <td>31</td>\n      <td>24</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>89</td>\n      <td>3</td>\n      <td>3</td>\n      <td>41</td>\n      <td>4</td>\n      <td>15</td>\n      <td>61</td>\n      <td>22</td>\n    </tr>\n    <tr>\n      <th>8fd0aea4</th>\n      <td>93</td>\n      <td>93</td>\n      <td>2</td>\n      <td>29</td>\n      <td>1</td>\n      <td>1</td>\n      <td>4</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>24</td>\n      <td>2</td>\n      <td>2</td>\n      <td>16</td>\n      <td>4</td>\n      <td>12</td>\n      <td>13</td>\n      <td>12</td>\n    </tr>\n    <tr>\n      <th>9ccfa2ea</th>\n      <td>23</td>\n      <td>23</td>\n      <td>1</td>\n      <td>17</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>2</td>\n      <td>1</td>\n      <td>1</td>\n      <td>2</td>\n      <td>1</td>\n      <td>2</td>\n      <td>2</td>\n      <td>2</td>\n    </tr>\n    <tr>\n      <th>a818d37a</th>\n      <td>1586</td>\n      <td>1586</td>\n      
<td>2</td>\n      <td>50</td>\n      <td>1</td>\n      <td>1</td>\n      <td>6</td>\n      <td>3</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>17</td>\n      <td>2</td>\n      <td>2</td>\n      <td>7</td>\n      <td>3</td>\n      <td>3</td>\n      <td>11</td>\n      <td>5</td>\n    </tr>\n    <tr>\n      <th>bcf865d9</th>\n      <td>252</td>\n      <td>252</td>\n      <td>2</td>\n      <td>40</td>\n      <td>1</td>\n      <td>1</td>\n      <td>5</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>6</td>\n      <td>2</td>\n      <td>2</td>\n      <td>4</td>\n      <td>1</td>\n      <td>4</td>\n      <td>5</td>\n      <td>4</td>\n    </tr>\n    <tr>\n      <th>c0dd3be3</th>\n      <td>6405</td>\n      <td>6405</td>\n      <td>2</td>\n      <td>53</td>\n      <td>2</td>\n      <td>3</td>\n      <td>12</td>\n      <td>13</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>121</td>\n      <td>4</td>\n      <td>4</td>\n      <td>47</td>\n      <td>4</td>\n      <td>18</td>\n      <td>30</td>\n      <td>21</td>\n    </tr>\n    <tr>\n      <th>c706e647</th>\n      <td>2</td>\n      <td>2</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>dedf689d</th>\n      <td>4853</td>\n      <td>4853</td>\n      <td>2</td>\n      <td>52</td>\n      <td>2</td>\n      <td>2</td>\n      <td>12</td>\n      <td>12</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>94</td>\n      <td>3</td>\n      <td>3</td>\n      
<td>45</td>\n      <td>4</td>\n      <td>19</td>\n      <td>73</td>\n      <td>24</td>\n    </tr>\n    <tr>\n      <th>e787de0e</th>\n      <td>367</td>\n      <td>367</td>\n      <td>2</td>\n      <td>21</td>\n      <td>1</td>\n      <td>2</td>\n      <td>9</td>\n      <td>9</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>21</td>\n      <td>2</td>\n      <td>2</td>\n      <td>13</td>\n      <td>3</td>\n      <td>7</td>\n      <td>14</td>\n      <td>7</td>\n    </tr>\n    <tr>\n      <th>f028772b</th>\n      <td>3038300</td>\n      <td>3038300</td>\n      <td>2</td>\n      <td>53</td>\n      <td>4</td>\n      <td>5</td>\n      <td>686</td>\n      <td>523</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>3</td>\n      <td>2</td>\n      <td>584</td>\n      <td>5</td>\n      <td>6</td>\n      <td>149</td>\n      <td>4</td>\n      <td>32</td>\n      <td>161</td>\n      <td>36</td>\n    </tr>\n    <tr>\n      <th>f66779e6</th>\n      <td>92571</td>\n      <td>92571</td>\n      <td>2</td>\n      <td>53</td>\n      <td>1</td>\n      <td>3</td>\n      <td>34</td>\n      <td>33</td>\n      <td>1</td>\n      <td>1</td>\n      <td>...</td>\n      <td>1</td>\n      <td>2</td>\n      <td>270</td>\n      <td>4</td>\n      <td>4</td>\n      <td>95</td>\n      <td>4</td>\n      <td>26</td>\n      <td>145</td>\n      <td>32</td>\n    </tr>\n  </tbody>\n</table>\n<p>23 rows × 25 columns</p>\n</div>"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 5
    }
   ],
   "source": [
    "train_df.groupby(by='site_category').nunique()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "outputs": [
    {
     "data": {
      "text/plain": "False"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 49
    }
   ],
   "source": [
    "train_df['id'].is_monotonic\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "outputs": [
    {
     "data": {
      "text/plain": "'00'"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 82
    }
   ],
   "source": [
    "train_df['hour'][2][6:8]"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "outputs": [
    {
     "data": {
      "text/plain": "6    14102100\n7    14102100\nName: hour, dtype: object"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 81
    }
   ],
   "source": [
    "train_df['hour'][6:8]"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "data": {
      "text/plain": "a99f214a    8148098\nc357dbff      17287\n936e92fb       4627\nafeffc18       1645\ncef4c8cc       1258\n             ...   \nf719fc27          1\n09b76c8c          1\na9f66aba          1\n662e02f5          1\n36cfc709          1\nName: device_id, Length: 786741, dtype: int64"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 4
    }
   ],
   "source": [
    "#800倍？？？？\n",
    "train_df['device_id'].value_counts()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "data": {
      "text/plain": "ecad2386    6190353\ne2fcccd2     280797\n7358e05e     272138\na5184c22     209689\nfebd1138     177939\n             ...   \n66308a77          1\n4a8c7b17          1\nc4c44801          1\n238dd06d          1\nb80edcba          1\nName: app_id, Length: 5469, dtype: int64"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 5
    }
   ],
   "source": [
    "train_df['app_id'].value_counts()\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "outputs": [
    {
     "data": {
      "text/plain": "<bound method NDFrame.describe of 0          a99f214a\n1          a99f214a\n2          a99f214a\n3          a99f214a\n4          a99f214a\n             ...   \n9999995    a99f214a\n9999996    a99f214a\n9999997    709a50de\n9999998    a99f214a\n9999999    a4970f29\nName: device_id, Length: 10000000, dtype: object>"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 17
    }
   ],
   "source": [
    "train_df['device_id'].describe\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "count_s = 0\n",
    "def get_id(df):\n",
    "    global count_s\n",
    "    count_s += 1\n",
    "    if count_s%1000000 == 0:\n",
    "        print(count_s)\n",
    "    if df['device_id'] != \"a99f214a\":\n",
    "        return df['device_id']\n",
    "    else:\n",
    "        return df['device_ip'] + \"_\" + df['device_model']\n",
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-4-66df75634def>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;31m#device_ip', 'device_model', 'device_type', 'device_conn_type\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mtrain_df\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'user_id'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m\u001b[0mtrain_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mget_id\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mNameError\u001b[0m: name 'train_df' is not defined"
     ],
     "ename": "NameError",
     "evalue": "name 'train_df' is not defined",
     "output_type": "error"
    }
   ],
   "source": [
    "#device_ip', 'device_model', 'device_type', 'device_conn_type\n",
    "train_df['user_id'] =train_df.apply(get_id,axis=1)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "name": "stdout",
     "text": [
      "1000000\n",
      "2000000\n",
      "3000000\n",
      "4000000\n"
     ],
     "output_type": "stream"
    }
   ],
   "source": [
    "test_df['user_id'] =test_df.apply(get_id,axis=1)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [
    {
     "data": {
      "text/plain": "0     ddd2926e_44956a24\n1     96809ac8_711ee120\n2     b3cf8def_8a4875bd\n3     e8275b8f_6332421a\n4     9644d0bf_779d90c2\n            ...        \n95    03108db9_a0f5f879\n96    0b697be1_1f0bc64f\n97    58db4f0c_6332421a\n98    02b9b0fc_1aa0e912\n99    6b9769f2_4c8aeb60\nName: user_id, Length: 100, dtype: object"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 6
    }
   ],
   "source": [
    "train_df['user_id']"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "count_s = 0\n",
    "day = \"??\"\n",
    "hour = \"??\"\n",
    "d = defaultdict(int)\n",
    "d2 = {}\n",
    "dh = defaultdict(int)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [],
   "source": [
    "def get_prop(df):\n",
    "    global count_s,day,hour,d,d2,dh\n",
    "    count_s += 1\n",
    "    if count_s%1000000 == 0:\n",
    "        print(count_s)\n",
    "    if str(df['hour'])[4:6] !=day:\n",
    "        d.clear()\n",
    "        d = defaultdict(int)\n",
    "        d2 = {}\n",
    "        day = df['hour'][4:6]\n",
    "    if df['hour'][6:8] !=hour:\n",
    "        dh.clear()\n",
    "        dh = defaultdict(int)\n",
    "        hour = df['hour'][6:8]\n",
    "    time = int(df['hour'][6:8]) * 60 + int(float(df['id'][:5])/100000.0 * 60)\n",
    "    d[df['user_id']+'_n_'+df['C14']] +=1\n",
    "    d[df['user_id']+'_q_'+df['C17']] +=1\n",
    "    dh[df['user_id']+'_n_'+df['C14']] +=1\n",
    "    dh[df['user_id']+'_q_'+df['C17']] +=1\n",
    "    dh[df['user_id']]+=1\n",
    "    \n",
    "    media_id = 'f_'+df['app_id']\n",
    "    if media_id == \"ecad2386\":\n",
    "        media_id = 'c_'+df['site_id']\n",
    "    \n",
    "    d[df['user_id']+\"_\"+media_id] +=1\n",
    "    t = '-1'\n",
    "    if df['user_id'] not in d2:\n",
    "        d2[df['user_id']] = time\n",
    "    else:\n",
    "        t = str(time - d2[df['user_id']])\n",
    "        d2[df['user_id']] = time\n",
    "    m = d[df['user_id']+'_'+media_id]\n",
    "    c = d[df['user_id']+'_n_'+df['C14']]\n",
    "    c2 = d[df['user_id']+'_q_'+df['C17']]\n",
    "    ch = dh[df['user_id']+'_n_'+df['C14']]\n",
    "    ch1 = dh[df['user_id']+'_q_'+df['C17']]\n",
    "    return str(m),str(c),str(c2),str(ch),str(ch1),t"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "train_df[['user_id&media_id','user_id&C14_d','user_id&C17_d','user_id&C14_h','user_id&C17_h','time']]=train_df.apply(get_prop,axis=1,result_type='expand')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "name": "stdout",
     "text": [
      "1000000\n",
      "2000000\n",
      "3000000\n",
      "4000000\n"
     ],
     "output_type": "stream"
    }
   ],
   "source": [
    "test_df[['user_id&media_id','user_id&C14_d','user_id&C17_d','user_id&C14_h','user_id&C17_h','time']]=test_df.apply(get_prop,axis=1,result_type='expand')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [
    {
     "data": {
      "text/plain": "1       3715991\n2       1451370\n3        822163\n4        538719\n5        383318\n         ...   \n8101          1\n8168          1\n9030          1\n8109          1\n8857          1\nName: user_id&media_id, Length: 9333, dtype: int64"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 9
    }
   ],
   "source": [
    "train_df['user_id&media_id'].value_counts()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [
    {
     "data": {
      "text/plain": "  Unnamed: 0                      id click      hour    C1 banner_pos  \\\n0          0  1.0000094181510943e+18     0  14102100  1005          0   \n1          1  1.0000169349117864e+19     0  14102100  1005          0   \n2          2   1.000037190421512e+19     0  14102100  1005          0   \n3          3   1.000064072448084e+19     0  14102100  1005          0   \n4          4  1.0000679056417042e+19     0  14102100  1005          1   \n\n    site_id site_domain site_category    app_id  ... C19     C20  C21  \\\n0  1fbe01fe    f3845767      28905ebd  ecad2386  ...  35      -1   79   \n1  1fbe01fe    f3845767      28905ebd  ecad2386  ...  35  100084   79   \n2  1fbe01fe    f3845767      28905ebd  ecad2386  ...  35  100084   79   \n3  1fbe01fe    f3845767      28905ebd  ecad2386  ...  35  100084   79   \n4  fe8cc448    9166c161      0569f928  ecad2386  ...  35      -1  157   \n\n             user_id user_id&media_id user_id&C14_d user_id&C17_d  \\\n0  ddd2926e_44956a24                1             1             1   \n1  96809ac8_711ee120                1             1             1   \n2  b3cf8def_8a4875bd                1             1             1   \n3  e8275b8f_6332421a                1             1             1   \n4  9644d0bf_779d90c2                1             1             1   \n\n  user_id&C14_h user_id&C17_h time  \n0             1             1   -1  \n1             1             1   -1  \n2             1             1   -1  \n3             1             1   -1  \n4             1             1   -1  \n\n[5 rows x 32 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>Unnamed: 0</th>\n      <th>id</th>\n      <th>click</th>\n      <th>hour</th>\n      <th>C1</th>\n      <th>banner_pos</th>\n      <th>site_id</th>\n      <th>site_domain</th>\n      <th>site_category</th>\n      <th>app_id</th>\n      <th>...</th>\n      <th>C19</th>\n      <th>C20</th>\n      <th>C21</th>\n      <th>user_id</th>\n      <th>user_id&amp;media_id</th>\n      <th>user_id&amp;C14_d</th>\n      <th>user_id&amp;C17_d</th>\n      <th>user_id&amp;C14_h</th>\n      <th>user_id&amp;C17_h</th>\n      <th>time</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>0</td>\n      <td>1.0000094181510943e+18</td>\n      <td>0</td>\n      <td>14102100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>1fbe01fe</td>\n      <td>f3845767</td>\n      <td>28905ebd</td>\n      <td>ecad2386</td>\n      <td>...</td>\n      <td>35</td>\n      <td>-1</td>\n      <td>79</td>\n      <td>ddd2926e_44956a24</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>1</td>\n      <td>1.0000169349117864e+19</td>\n      <td>0</td>\n      <td>14102100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>1fbe01fe</td>\n      <td>f3845767</td>\n      <td>28905ebd</td>\n      <td>ecad2386</td>\n      <td>...</td>\n      <td>35</td>\n      <td>100084</td>\n      <td>79</td>\n      <td>96809ac8_711ee120</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>2</td>\n    
  <td>1.000037190421512e+19</td>\n      <td>0</td>\n      <td>14102100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>1fbe01fe</td>\n      <td>f3845767</td>\n      <td>28905ebd</td>\n      <td>ecad2386</td>\n      <td>...</td>\n      <td>35</td>\n      <td>100084</td>\n      <td>79</td>\n      <td>b3cf8def_8a4875bd</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>3</td>\n      <td>1.000064072448084e+19</td>\n      <td>0</td>\n      <td>14102100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>1fbe01fe</td>\n      <td>f3845767</td>\n      <td>28905ebd</td>\n      <td>ecad2386</td>\n      <td>...</td>\n      <td>35</td>\n      <td>100084</td>\n      <td>79</td>\n      <td>e8275b8f_6332421a</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>4</td>\n      <td>1.0000679056417042e+19</td>\n      <td>0</td>\n      <td>14102100</td>\n      <td>1005</td>\n      <td>1</td>\n      <td>fe8cc448</td>\n      <td>9166c161</td>\n      <td>0569f928</td>\n      <td>ecad2386</td>\n      <td>...</td>\n      <td>35</td>\n      <td>-1</td>\n      <td>157</td>\n      <td>9644d0bf_779d90c2</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 32 columns</p>\n</div>"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 14
    }
   ],
   "source": [
    "train_df.head()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "# Save without the row index: written as a column, pandas reads it back as\n",
    "# an extra 'Unnamed: 0' field. This also matches the test_df export.\n",
    "train_df.to_csv(\"train_sub_count.csv\", index=False)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [
    {
     "data": {
      "text/plain": "                     id      hour    C1 banner_pos   site_id site_domain  \\\n0  10000174058809263569  14103100  1005          0  235ba823    f6ebf28e   \n1  10000182526920855428  14103100  1005          0  1fbe01fe    f3845767   \n2  10000554139829213984  14103100  1005          0  1fbe01fe    f3845767   \n3  10001094637809798845  14103100  1005          0  85f751fd    c4e18dd6   \n4  10001377041558670745  14103100  1005          0  85f751fd    c4e18dd6   \n\n  site_category    app_id app_domain app_category  ...  C19     C20  C21  \\\n0      f028772b  ecad2386   7801e8d9     07d7df22  ...  175  100075   23   \n1      28905ebd  ecad2386   7801e8d9     07d7df22  ...   35  100083   51   \n2      28905ebd  ecad2386   7801e8d9     07d7df22  ...   35  100083   51   \n3      50e219e0  51cedd4e   aefc06bd     0f2161f8  ...  809  100156   61   \n4      50e219e0  9c13b419   2347f47a     f95efa07  ...   47      -1  221   \n\n             user_id user_id&media_id user_id&C14_d user_id&C17_d  \\\n0  69f45779_0eb711ec                1             1             1   \n1  e8d44657_ecb851b2                1             1             1   \n2  10fb085b_1f0bc64f                1             1             1   \n3  422d257a_542422a7                1             1             1   \n4  078c6b38_1f0bc64f                1             1             1   \n\n  user_id&C14_h user_id&C17_h time  \n0             1             1   -1  \n1             1             1   -1  \n2             1             1   -1  \n3             1             1   -1  \n4             1             1   -1  \n\n[5 rows x 30 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>id</th>\n      <th>hour</th>\n      <th>C1</th>\n      <th>banner_pos</th>\n      <th>site_id</th>\n      <th>site_domain</th>\n      <th>site_category</th>\n      <th>app_id</th>\n      <th>app_domain</th>\n      <th>app_category</th>\n      <th>...</th>\n      <th>C19</th>\n      <th>C20</th>\n      <th>C21</th>\n      <th>user_id</th>\n      <th>user_id&amp;media_id</th>\n      <th>user_id&amp;C14_d</th>\n      <th>user_id&amp;C17_d</th>\n      <th>user_id&amp;C14_h</th>\n      <th>user_id&amp;C17_h</th>\n      <th>time</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>10000174058809263569</td>\n      <td>14103100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>235ba823</td>\n      <td>f6ebf28e</td>\n      <td>f028772b</td>\n      <td>ecad2386</td>\n      <td>7801e8d9</td>\n      <td>07d7df22</td>\n      <td>...</td>\n      <td>175</td>\n      <td>100075</td>\n      <td>23</td>\n      <td>69f45779_0eb711ec</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>10000182526920855428</td>\n      <td>14103100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>1fbe01fe</td>\n      <td>f3845767</td>\n      <td>28905ebd</td>\n      <td>ecad2386</td>\n      <td>7801e8d9</td>\n      <td>07d7df22</td>\n      <td>...</td>\n      <td>35</td>\n      <td>100083</td>\n      <td>51</td>\n      <td>e8d44657_ecb851b2</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n    
  <th>2</th>\n      <td>10000554139829213984</td>\n      <td>14103100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>1fbe01fe</td>\n      <td>f3845767</td>\n      <td>28905ebd</td>\n      <td>ecad2386</td>\n      <td>7801e8d9</td>\n      <td>07d7df22</td>\n      <td>...</td>\n      <td>35</td>\n      <td>100083</td>\n      <td>51</td>\n      <td>10fb085b_1f0bc64f</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>10001094637809798845</td>\n      <td>14103100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>85f751fd</td>\n      <td>c4e18dd6</td>\n      <td>50e219e0</td>\n      <td>51cedd4e</td>\n      <td>aefc06bd</td>\n      <td>0f2161f8</td>\n      <td>...</td>\n      <td>809</td>\n      <td>100156</td>\n      <td>61</td>\n      <td>422d257a_542422a7</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>10001377041558670745</td>\n      <td>14103100</td>\n      <td>1005</td>\n      <td>0</td>\n      <td>85f751fd</td>\n      <td>c4e18dd6</td>\n      <td>50e219e0</td>\n      <td>9c13b419</td>\n      <td>2347f47a</td>\n      <td>f95efa07</td>\n      <td>...</td>\n      <td>47</td>\n      <td>-1</td>\n      <td>221</td>\n      <td>078c6b38_1f0bc64f</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>1</td>\n      <td>-1</td>\n    </tr>\n  </tbody>\n</table>\n<p>5 rows × 30 columns</p>\n</div>"
     },
     "metadata": {},
     "output_type": "execute_result",
     "execution_count": 9
    }
   ],
   "source": [
    "test_df.head()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [],
   "source": [
    "# Persist the test frame as CSV, leaving the row index out of the file.\n",
    "test_df.to_csv(path_or_buf=\"test_sub_count.csv\", index=False)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [
    "\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n"
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}